# Setup Célula oculta para o setup:
Os dados são obtidos de um servidor SQL ou csv e armazenados na variável df
df <- read_csv("/media/njaneto/HD1/FIAP/PROGRAMANDO_IA_COM_R/fifa18-data-analysis/model/data/fifa18.csv", locale = locale(encoding = "ISO-8859-1"))
New names:
* `` -> ...1
Registered S3 method overwritten by 'cli':
method from
print.tree tree
Rows: 17994 Columns: 58
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): name, full_name, club, league, nationality, Position, work_rate_att, work_rate_def, preferred_foot
dbl (49): ...1, ID, special, eur_value, eur_wage, eur_release_clause, crossing, finishing, heading_accuracy, short_...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
setDT(df)
Quantidade de linhas e colunas obtidas:
dim(df)
[1] 17994 58
Primeiros registros obtidos:
head(df)
Incluindo coluna de continente
Africa<-c('Algeria','Angola','Benin','Botswana','Burkina','Burundi','Cameroon','Cape Verde','Central African Republic','Chad','Comoros','Congo','Congo Democratic Republic of','Djibouti','Egypt','Equatorial Guinea','Eritrea','Ethiopia','Gabon','Gambia','Ghana','Guinea','Guinea-Bissau','Ivory Coast','Kenya','Lesotho','Liberia','Libya','Madagascar','Malawi','Mali','Mauritania','Mauritius','Morocco','Mozambique','Namibia','Niger','Nigeria','Rwanda','Sao Tome and Principe','Senegal','Seychelles','Sierra Leone','Somalia','South Africa','South Sudan','Sudan','Swaziland','Tanzania','Togo','Tunisia','Uganda','Zambia','Zimbabwe','Burkina Faso')
Antarctica<-c('Fiji','Kiribati','Marshall Islands','Micronesia','Nauru','New Zealand','Palau','Papua New Guinea','Samoa','Solomon Islands','Tonga','Tuvalu','Vanuatu')
Asia<-c('Afghanistan','Bahrain','Bangladesh','Bhutan','Brunei','Burma (Myanmar)','Cambodia','China','East Timor','India','Indonesia','Iran','Iraq','Israel','Japan','Jordan','Kazakhstan','North Korea','South Korea','Kuwait','Kyrgyzstan','Laos','Lebanon','Malaysia','Maldives','Mongolia','Nepal','Oman','Pakistan','Philippines','Qatar','Russian Federation','Saudi Arabia','Singapore','Sri Lanka','Syria','Tajikistan','Thailand','Turkey','Turkmenistan','United Arab Emirates','Uzbekistan','Vietnam','Yemen','Russia')
Europe<-c('Albania','Andorra','Armenia','Austria','Azerbaijan','Belarus','Belgium','Bosnia and Herzegovina','Bulgaria','Croatia','Cyprus','Czech Republic','Denmark','Estonia','Finland','France','Georgia','Germany','Greece','Hungary','Iceland','Ireland','Italy','Latvia','Liechtenstein','Lithuania','Luxembourg','Macedonia','Malta','Moldova','Monaco','Montenegro','Netherlands','Norway','Poland','Portugal','Romania','San Marino','Scotland','Serbi','Slovakia','Slovenia','Spain','Sweden','Switzerland','Ukraine','England','Vatican City','Republic of Ireland','Wales')
North_america<-c('Antigua and Barbuda','Bahamas','Barbados','Belize','Canada','Costa Rica','Cuba','Dominica','Dominican Republic','El Salvador','Grenada','Guatemala','Haiti','Honduras','Jamaica','Mexico','Nicaragua','Panama','Saint Kitts and Nevis','Saint Lucia','Saint Vincent and the Grenadines','Trinidad and Tobago','United States')
South_america<-c('Argentina','Bolivia','Brazil','Chile','Colombia','Ecuador','Guyana','Paraguay','Peru','Suriname','Uruguay','Venezuela')
df[, continent:= df$nationality]
df <- df %>% relocate(continent, .after = nationality)
df$continent[df$continent %in% Africa ] <- "Africa"
df$continent[df$continent %in% Antarctica ] <- "Antarctica"
df$continent[df$continent %in% Asia ] <- "Asia"
df$continent[df$continent %in% Europe ] <- "Europe"
df$continent[df$continent %in% North_america ] <- "North_america"
df$continent[df$continent %in% South_america ] <- "South_america"
Validação incial de dados
plot_intro(df)
plot_missing(df)
Jogadores por pais
pais <- df[,.N, by=nationality]
fr <- joinCountryData2Map(dF=pais, joinCode = "NAME", nameJoinColumn = "nationality", verbose = F)
148 codes from your data successfully matched countries in the map
16 codes from your data failed to match with a country code in the map
95 codes from the map weren't represented in your data
mapCountryData(mapToPlot = fr,nameColumnToPlot = "N",catMethod = "fixedWidth",
oceanCol = "steelblue1",missingCountryCol = "white",
mapTitle = "Jogadores por pais",
aspect = "variable")
Warning in plot.window(xlim = xlim, ylim = ylim, asp = aspect) :
NAs introduzidos por coerção
selecionando os atacantes
OBACK_EUROPE <- df %>%
filter(Position == "Outside-back" & continent == "Europe")
head(OBACK_EUROPE)
selecionando os goleiros da variável
#-- base para validacao
OBACK_NOT_EUROPE <- df %>%
filter(Position == "Outside-back" & continent != "Europe")
head(OBACK_NOT_EUROPE)
removendo os NAs e variaves nao numericas da base de treino
OBACK_EUROPE <- OBACK_EUROPE %>%
select_if(~ !any(is.na(.))) %>%
select_if(~ any(is.numeric(.)))
head(OBACK_EUROPE)
Removendo os NAs e variaves nao numericas da base de validacao
fifa.18.ob <- OBACK_NOT_EUROPE %>%
select_if(~ !any(is.na(.))) %>%
select_if(~ any(is.numeric(.)))
head(fifa.18.ob)
Grafico de boxplot simples
boxplot(OBACK_EUROPE)
Grafico de Correlacao simples
corrMatrix <- cor(OBACK_EUROPE)
corrplot.mixed(corrMatrix,
lower = "ellipse",
upper = "number",
tl.pos = "lt",
tl.col = "black",
order="hclust",
hclust.method = "ward.D",
addrect = 3)
Modelo com Floresta
set.seed(1)
reg.test <- randomForest(formula = eur_value ~ .,
data = OBACK_EUROPE,
ntree=100,
proximity=TRUE,
localImp=TRUE)
plot(reg.test)
Predição dos preços dos goleiros
predito = predict(reg.test, OBACK_EUROPE)
print(paste("R2: ", R2_Score(predito, OBACK_EUROPE$eur_value) ) )
[1] "R2: 0.991296178162238"
print(paste("MSE: ", MSE(predito, OBACK_EUROPE$eur_value) ) )
[1] "MSE: 90733971182.37"
OBACK_EUROPE[, predito:=predito]
OBACK_EUROPE <- OBACK_EUROPE %>% relocate(predito, .after = eur_value)
head(OBACK_EUROPE)
Avaliaçao de acerto X erro do modelo
OBACK_EUROPE %>%
mutate(predito = predict(reg.test, .)) %>%
plot_ly(x = ~eur_value,
y= ~predito,
type='scatter',
mode='markers',
text=~paste0("Real value: ", currency(eur_value, symbol='€', digits = 0L),
"\nPredicted value: ", currency(predito, symbol='€', digits = 0L),
"\nError: ", (eur_value - predito)),
name="Dispersão") %>%
add_segments(x=0, y=0, xend = 100000000, yend = 100000000, name="Equilíbrio")
Predição dos preços dos goleiros
predito = predict(reg.test, fifa.18.ob)
print(paste("R2: ", R2_Score(predito, fifa.18.ob$eur_value) ) )
[1] "R2: 0.883693708450125"
print(paste("MSE: ", MSE(predito, fifa.18.ob$eur_value) ) )
[1] "MSE: 932950885137.678"
fifa.18.ob[, predito:=predito]
fifa.18.ob <- fifa.18.ob %>% relocate(predito, .after = eur_value)
head(fifa.18.ob)
fifa.18.ob %>%
mutate(predito = predict(reg.test, .)) %>%
plot_ly(x = ~eur_value,
y= ~predito,
type='scatter',
mode='markers',
text=~paste0("Real value: ", currency(eur_value, symbol='€', digits = 0L),
"\nPredicted value: ", currency(predito, symbol='€', digits = 0L),
"\nError: ", (eur_value - predito)),
name="Dispersão") %>%
add_segments(x=0, y=0, xend = 100000000, yend = 100000000, name="Equilíbrio")
output <- OBACK_NOT_EUROPE %>%
select(Position, name, eur_value)
output[, eur_value := currency(fifa.18.ob$eur_value, symbol = '€', digits = 0L)]
output[, 'Preço "Calculado" (€)' := currency(fifa.18.ob$predito, symbol = '€', digits = 0L)]
output[, 'Potencial Valorização (€)' := currency((fifa.18.ob$predito - fifa.18.ob$eur_value), symbol='€', digits = 0L) ]
output[, 'Potencial Valorização (%)' := (percent((fifa.18.ob$predito - fifa.18.ob$eur_value) / 100000000)) ]
output <- output %>%
rename(
'Posição' = Position,
'Jogador' = name,
'Preço de mercado' = eur_value
)
head(output)
NA
Pedro Albuquerque - São Paulo - 2021